1 import pandas as pd
2 import numpy as np
3
4 # merge合并 ,类似于Excel中的vlookup
5
6 df1 = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
7 'A': ['A0', 'A1', 'A2', 'A3'],
8 'B': ['B0', 'B1', 'B2', 'B3']})
9 df2 = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
10 'C': ['C0', 'C1', 'C2', 'C3'],
11 'D': ['D0', 'D1', 'D2', 'D3']})
12 df3 = pd.DataFrame({'key1': ['K0', 'K0', 'K2', 'K3'],
13 'key2': ['K0', 'K1', 'K0', 'K1'],
14 'A': ['A0', 'A1', 'A2', 'A3'],
15 'B': ['B0', 'B1', 'B2', 'B3']})
16 df4 = pd.DataFrame({'key1': ['K0', 'K0', 'K2', 'K3'],
17 'key2': ['K0', 'K0', 'K0', 'K0'],
18 'C': ['C0', 'C1', 'C2', 'C3'],
19 'D': ['D0', 'D1', 'D2', 'D3']})
20 print(pd.merge(df1,df2,on='key'))
21 # 第一个DataFrame为拼接后左边的
22 # 第二个DataFrame为拼接后右边的
23 # on 为参考键
24 '''
25 key A B C D
26 0 K0 A0 B0 C0 D0
27 1 K1 A1 B1 C1 D1
28 2 K2 A2 B2 C2 D2
29 3 K3 A3 B3 C3 D3
30 '''
31 # 多个键连接
32 print(pd.merge(df3, df4, on=['key1', 'key2']))
33 # 当两个DataFrame中的key1和key2都相同时,才会连,否则不连
34 '''
35 key1 key2 A B C D
36 0 K0 K0 A0 B0 C0 D0
37 1 K0 K0 A0 B0 C1 D1
38 2 K2 K0 A2 B2 C2 D2
39 '''
40 # 参数how , 合并方式
41 # 默认,取交集
42 print(pd.merge(df3, df4, on=['key1', 'key2'], how='inner'))
43 print('-' * 8)
44 '''
45 key1 key2 A B C D
46 0 K0 K0 A0 B0 C0 D0
47 1 K0 K0 A0 B0 C1 D1
48 2 K2 K0 A2 B2 C2 D2
49 --------
50 '''
51 # 取并集,outer,数据缺失范围NaN
52 print(pd.merge(df3, df4, on=['key1', 'key2'], how='outer'))
53 print('-' * 8)
54 '''
55 key1 key2 A B C D
56 0 K0 K0 A0 B0 C0 D0
57 1 K0 K0 A0 B0 C1 D1
58 2 K0 K1 A1 B1 NaN NaN
59 3 K2 K0 A2 B2 C2 D2
60 4 K3 K1 A3 B3 NaN NaN
61 5 K3 K0 NaN NaN C3 D3
62 --------
63 '''
64 # 参照df3为参考合并,数据缺失范围NaN
65 print(pd.merge(df3, df4, on=['key1', 'key2'], how='left'))
66 print('-' * 8)
67 '''
68 key1 key2 A B C D
69 0 K0 K0 A0 B0 C0 D0
70 1 K0 K0 A0 B0 C1 D1
71 2 K0 K1 A1 B1 NaN NaN
72 3 K2 K0 A2 B2 C2 D2
73 4 K3 K1 A3 B3 NaN NaN
74 --------
75 '''
76 # 参照df4为参考合并,数据缺失范围NaN
77 print(pd.merge(df3, df4, on=['key1', 'key2'], how='right'))
78 print('-' * 8)
79 '''
80 key1 key2 A B C D
81 0 K0 K0 A0 B0 C0 D0
82 1 K0 K0 A0 B0 C1 D1
83 2 K2 K0 A2 B2 C2 D2
84 3 K3 K0 NaN NaN C3 D3
85 --------
86 '''
87 # 参数left_on,right_on,left_index, right_index ,当键不为一个列时,可以单独设置左键与右键
88 df5 = pd.DataFrame({'lkey': list('bbacaab'),
89 'data1': range(7)})
90 df6 = pd.DataFrame({'rkey': list('abd'),
91 'date2': range(3)})
92 print(df5)
93 print(df6)
94 print(pd.merge(df5,df6,left_on='lkey',right_on='rkey'))
95 '''
96 lkey data1
97 0 b 0
98 1 b 1
99 2 a 2
100 3 c 3
101 4 a 4
102 5 a 5
103 6 b 6
104 rkey date2
105 0 a 0
106 1 b 1
107 2 d 2
108 lkey data1 rkey date2
109 0 b 0 b 1
110 1 b 1 b 1
111 2 b 6 b 1
112 3 a 2 a 0
113 4 a 4 a 0
114 5 a 5 a 0
115 '''
116
117 # concat() 连接,默认axis=0 行+行,当axis=1时,列+列 成为Dataframe
118 s1 = pd.Series([2, 3, 4])
119 s2 = pd.Series([1, 2, 3])
120 print(pd.concat([s1, s2]))
121 '''
122 0 2
123 1 3
124 2 4
125 0 1
126 1 2
127 2 3
128 dtype: int64
129 '''
130 print(pd.concat([s1,s2],axis=1))
131 '''
132 0 1
133 0 2 1
134 1 3 2
135 2 4 3
136 '''
137 snew = pd.concat([s1, s2], axis=1)
138 snew.reset_index(inplace=True)
139 print(snew)
140 '''
141 index 0 1
142 0 0 2 1
143 1 1 3 2
144 2 2 4 3
145 '''
146 snew2 = pd.concat([s1, s2], axis=1)
147 snew2.reset_index(inplace=True, drop=True)
148 print(snew2)
149 '''
150 0 1
151 0 2 1
152 1 3 2
153 2 4 3
154 '''
155
156 # 去重 .duplicated()
157 s3 = pd.Series([1, 2, 2, 4, 4, 6, 7, 6, 87])
158 # 判断是否重复
159 print(s3.duplicated())
160 '''
161 0 False
162 1 False
163 2 True
164 3 False
165 4 True
166 5 False
167 6 False
168 7 True
169 8 False
170 dtype: bool
171 '''
172 # 取出重复的值
173 s4 = s3[s3.duplicated()]
174 print(s4)
175 # 取出唯一的元素
176 s5 = s3[s3.duplicated() == False]
177 print(s5)
178 '''
179 0 1
180 1 2
181 3 4
182 5 6
183 6 7
184 8 87
185 dtype: int64
186 '''
187 s5 = s3.drop_duplicates()
188 # 可以通过设置参数:inplace控制是否替换原先的值
189 print(s5)
190 '''
191 0 1
192 1 2
193 3 4
194 5 6
195 6 7
196 8 87
197 dtype: int64
198 '''
199 df7 = pd.DataFrame({'key1':['a','a',3,4,3],
200 'key2':['a','a','b','b',5]})
201 print(df7.duplicated())
202 # 按行检测,第二次出现时,返回True
203 '''
204 0 1
205 1 2
206 3 4
207 5 6
208 6 7
209 8 87
210 dtype: int64
211 '''
212 # 今查看key2列
213 print(df7['key2'].duplicated())
214 '''
215 0 False
216 1 True
217 2 False
218 3 True
219 4 False
220 Name: key2, dtype: bool
221 '''
222 # 直接去重
223 print(df7.drop_duplicates())
224 '''
225 key1 key2
226 0 a a
227 2 3 b
228 3 4 b
229 4 3 5
230 '''
231 print(df7['key2'].drop_duplicates())
232 '''
233 0 a
234 2 b
235 4 5
236 Name: key2, dtype: object
237 '''
238
239 # 替换 .replace()
240 s6 = pd.Series(list('askjdghs'))
241 # 一次性替换一个值
242 # print(s6.replace('s','dsd'))
243 '''
244 0 a
245 1 dsd
246 2 k
247 3 j
248 4 d
249 5 g
250 6 h
251 7 dsd
252 dtype: object
253 '''
254 # 一次性替换多个值
255 print(s6.replace(['a','s'],np.nan))
256 '''
257 0 NaN
258 1 NaN
259 2 k
260 3 j
261 4 d
262 5 g
263 6 h
264 7 NaN
265 dtype: object
266 '''
267 # 通过字典的形式替换值
268 print(s6.replace({'a':np.nan}))
269 '''
270 0 NaN
271 1 s
272 2 k
273 3 j
274 4 d
275 5 g
276 6 h
277 7 s
278 dtype: object
279
280 '''